Loading Business and Reviews tables from Azure blob storage

In [1]:
yelp_business = spark.read.csv('wasb://main@yelpproject.blob.core.windows.net/yelp_business.csv', sep =',', header=True, mode="DROPMALFORMED", inferSchema=True)
yelp_business.write.saveAsTable("yelp_business", mode="overwrite")
yelp_business.printSchema()
Starting Spark application
IDYARN Application IDKindStateSpark UIDriver logCurrent session?
2application_1531743118469_0008pyspark3idleLinkLink✔
SparkSession available as 'spark'.
root
 |-- business_id: string (nullable = true)
 |-- name: string (nullable = true)
 |-- neighborhood: string (nullable = true)
 |-- address: string (nullable = true)
 |-- city: string (nullable = true)
 |-- state: string (nullable = true)
 |-- postal_code: string (nullable = true)
 |-- latitude: string (nullable = true)
 |-- longitude: string (nullable = true)
 |-- stars: string (nullable = true)
 |-- review_count: string (nullable = true)
 |-- is_open: double (nullable = true)
 |-- categories: string (nullable = true)
In [2]:
yelp_review = spark.read.csv('wasb://main@yelpproject.blob.core.windows.net/yelp_review_tab.csv', sep ='\t', header=True, mode="DROPMALFORMED", inferSchema=True)
yelp_review.write.saveAsTable("yelp_review", mode="overwrite")
yelp_review.printSchema()
root
 |-- _c0: string (nullable = true)
 |-- review_id: string (nullable = true)
 |-- user_id: string (nullable = true)
 |-- business_id: string (nullable = true)
 |-- stars: integer (nullable = true)
 |-- date: timestamp (nullable = true)
 |-- text: string (nullable = true)
 |-- useful: integer (nullable = true)
 |-- funny: integer (nullable = true)
 |-- cool: integer (nullable = true)

Visualisations (Businesses)

In [6]:
yelp_review.count()
3116417
In [7]:
# Subset open businesses and safe as temp table using pyspark
df_business = spark.sql("""
SELECT name, latitude, longitude, stars from yelp_business WHERE is_open IN (1)
""")
df_business.registerTempTable('df_business')
In [8]:
# Number of open bussinesses 
df_business.count()
106978
In [9]:
df_business.head(1)
[Row(name='"""Brown Brothers Automotive"""', latitude='33.4154661', longitude='-111.7908836', stars='3.0')]
In [15]:
%%sql -o buss_open
/* Subset open businesses and safe as temp table using magic sql command */
SELECT name, latitude, longitude, stars from yelp_business WHERE is_open IN (1)
name latitude longitude stars
0 """Brown Brothers Automotive""" 33.415466 -111.790884 3.0
1 """Westside Sports Complex""" 33.469867 -112.269622 2.5
2 """Valvoline Instant Oil Change""" 41.243435 -81.356043 5.0
3 """Arturo's Bar & Grille""" 41.411840 -81.915284 3.5
4 """PAPYRUS""" 33.623804 -111.925750 4.0
5 """Fired Pie""" 33.307875 -111.742584 4.0
6 """Truhit Fitness - Central Scottsdale""" 33.523540 -111.897175 4.0
7 """Da Paolo Trattoria & Sports Bar""" 43.858075 -79.850113 3.0
8 """Circle K Stores""" 33.487801 -111.986688 3.0
9 """Wendy's""" 35.202392 -80.734367 3.5
10 """Einstein Bros""" 36.143822 -115.294084 2.0
11 """Chandler Apartment Homes""" 36.041208 -115.240249 4.0
12 """Brush & Nib Studio""" 33.662944 -112.018233 2.0
13 """Stay-A-While Cat Shelter""" 41.306912 -81.750594 4.0
14 """Canine Cafe""" 35.202561 -80.866015 4.5
15 """iCracked""" 41.490803 -81.672676 5.0
16 """Heniford B Todd MD""" 35.205981 -80.838768 2.5
17 """Photobuddy""" 33.356528 -111.862755 5.0
18 """La Distillerie No.3""" 45.546028 -73.575917 4.5
19 """Paul Meredith - CityCan Financial Corporati... 43.702629 -79.397851 5.0
20 """Howard Johnson Cleveland Airport""" 41.403805 -81.815539 1.5
21 """District American Kitchen and Wine Bar""" 33.451746 -112.070249 3.5
22 """Charlotte RIPS""" 35.195970 -80.797922 5.0
23 """The Ecig Flavourium - Vape Boutique""" 43.653705 -79.373295 5.0
24 """Tropical Smoothie Cafe""" 36.108308 -115.060339 2.0
25 """Starbucks""" 43.766762 -79.475958 4.5
26 """BJ's Stamps & Coins""" 33.639301 -112.196093 4.0
27 """USO""" 48.684543 9.045675 5.0
28 """Sofi's Mexican Bar & Grill""" 43.783560 -79.170583 4.0
29 """Clean Machine Car Wash""" 41.156015 -81.311795 2.5
... ... ... ... ...
2470 """Starbucks""" 45.493982 -73.578751 4.5
2471 """Wellington Apartments""" 43.047903 -89.481730 2.5
2472 """Amsterdam Barrel House""" 43.705730 -79.361508 3.0
2473 """Tea N Bannock""" 43.671182 -79.327356 4.5
2474 """McDonald's""" 35.148784 -80.832955 2.5
2475 """KidsPark""" 33.597863 -111.978126 4.5
2476 """Talk Nail and Spa""" 33.495001 -112.201741 4.0
2477 """First United Methodist Child Care Center""" 40.118623 -88.246454 5.0
2478 """Baymont Inn And Suites Las Vegas""" 36.049116 -115.169470 3.0
2479 """Advance Auto Parts""" 35.209367 -80.672839 1.0
2480 """The McCormick Scottsdale""" 33.547957 -111.924799 4.0
2481 """Crafton Ice Cream Delite""" 40.438401 -80.069193 4.5
2482 """Albert's Tire & Service Center""" 40.430730 -80.000175 4.5
2483 """Scéna""" 45.504318 -73.549567 3.5
2484 """Blooming Rose Esthetics""" 43.861744 -79.505322 2.5
2485 """Charlotte Endodontics""" 35.208427 -80.825276 4.5
2486 """US Nails & Spa""" 36.155885 -115.206782 3.0
2487 """Harambe Afrika""" 48.740800 9.153560 3.5
2488 """License Agency #1826""" 41.395847 -81.641560 3.0
2489 """Rj's Auto Pros""" 41.099906 -81.382270 5.0
2490 """Flynn's Tire & Auto Service""" 41.155604 -81.325395 3.5
2491 """Gourmet Goombahs""" 35.007370 -80.945076 3.5
2492 """Enduro Sport""" 43.705859 -79.362088 3.5
2493 """Rest-O-Vieux Poele""" 45.063789 -73.317637 4.5
2494 """The Ballet""" 43.649062 -79.420478 3.5
2495 """Whist Stove & Spirits""" 36.020968 -115.086202 3.5
2496 """Los Paisas""" 35.089829 -80.858629 4.5
2497 """Bett Lounge""" 48.781325 9.177313 4.0
2498 """StainPro""" 33.638182 -112.270476 1.5
2499 """Pine Ridge Country Club""" 41.606328 -81.449137 4.0

2500 rows × 4 columns

First simple visualization

In [16]:
%%local
%matplotlib inline
import matplotlib.pyplot as plt
import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
init_notebook_mode(connected=True)


scl = [ [0,"rgb(5, 10, 172)"],[0.35,"rgb(40, 60, 190)"],[0.5,"rgb(70, 100, 245)"],\
    [0.6,"rgb(90, 120, 245)"],[0.7,"rgb(106, 137, 247)"],[1,"rgb(220, 220, 220)"] ]

data = [ dict(
        type = 'scattergeo',
        locationmode = 'USA-states',
        lon = buss_open['longitude'],
        lat = buss_open['latitude'],
        text = buss_open['name'],
        mode = 'markers',
        marker = dict(
            size = 8,
            opacity = 0.8,
            reversescale = True,
            autocolorscale = False,
            symbol = 'square',
            line = dict(
                width=1,
                color='rgba(102, 102, 102)'
            ),
            colorscale = scl,
            cmin = 0,
            color = buss_open['stars'],
            cmax = buss_open['stars'].max(),
            colorbar=dict(
                title="Ratings"
            )
        ))]

layout = dict(
        title = 'Yelp business',
        colorbar = True,
        geo = dict(
            scope='usa',
            projection=dict( type='albers usa' ),
            showland = True,
            landcolor = "rgb(250, 250, 250)",
            subunitcolor = "rgb(217, 217, 217)",
            countrycolor = "rgb(217, 217, 217)",
            countrywidth = 0.5,
            subunitwidth = 0.5
        ),
    )

fig = dict( data=data, layout=layout )
iplot(fig, validate=False, filename='business_on_map')

Review ratings

In [12]:
yelp_review.groupBy("stars").count().show()
+-----+-------+
|stars|  count|
+-----+-------+
|    1| 462610|
|    3| 285211|
|    5|1520985|
|    4| 621761|
|    2| 225850|
+-----+-------+
In [17]:
%%sql
SELECT stars, COUNT(*) as N FROM yelp_review GROUP BY stars
In [ ]: